---
title: "Cleaning Arab Opinion Index"
---

# Load

```{r}
# run the shared helper script (expected to load the packages used below)
  source("helper-packages.R")

# read the raw 2019-2020 Arab Opinion Index Stata file, decoding text as latin1
  aoi_2019_2020_raw <-
    read_dta(
      file = "../raw-data/y-multi-arab-opinion-index/aoi-2019-2020/AOI-DATA-2019-2020-EN-STATA.dta",
      encoding = "latin1"
    )
```

Important: We checked the following rounds and confirmed that they do not include a religion variable:
2011, 2012-2013, 2014, 2015, 2016, and 2017-2018.

# Clean Arab Opinion Index 2019-2020

```{r}
# fieldwork window per country: one row per country, with the first and last
# interview dates entered as day-month-year text and then parsed to Date
  aoi2019_2020_dates <-
    tribble(
      ~resp_country_common, ~resp_interview_start_date, ~resp_interview_end_date,
      "Tunisia",            "03-11-2019",               "15-12-2019",
      "Sudan",              "01-12-2019",               "31-12-2019",
      "Palestine",          "01-12-2019",               "29-12-2019",
      "Morocco",            "05-12-2019",               "19-01-2020",
      "Mauritania",         "17-12-2019",               "31-01-2020",
      "Saudi Arabia",       "27-12-2019",               "15-04-2020",
      "Kuwait",             "21-12-2019",               "11-01-2020",
      "Egypt",              "05-01-2020",               "04-02-2020",
      "Jordan",             "22-01-2020",               "02-02-2020",
      "Lebanon",            "26-02-2020",               "20-04-2020",
      "Algeria",            "25-05-2020",               "20-07-2020",
      "Iraq",               "31-05-2020",               "12-07-2020",
      "Qatar",              "01-07-2020",               "20-08-2020"
    ) %>%
    mutate(
      # both columns share the same day-month-year layout
      resp_interview_start_date =
        as.Date(resp_interview_start_date, format = "%d-%m-%Y"),
      resp_interview_end_date =
        as.Date(resp_interview_end_date, format = "%d-%m-%Y")
    )

# clean
# Builds the harmonised respondent-level table for AOI 2019-2020:
#   1. derives the meta-data `resp_*` columns from the raw survey items,
#   2. attaches the per-country fieldwork window from `aoi2019_2020_dates`,
#   3. derives demographic, social-distance, trust and religiosity columns,
#   4. keeps only the columns whose names start with `resp_`.
  clean_aoi2019_2020 <- 
    aoi_2019_2020_raw %>% 
    mutate(
      
    #########################  
    ####### META-DATA #######  
    #########################      
      
      # source name (character vector, title case)
        resp_source = "Arab Opinion Index",
        
      # round number (character vector, title case)  
        resp_round = "Wave 7",
      
      # url to dataset source, where publicly available (character vector)
        resp_original_data_url = "bit.ly/3BW50O3",

      # survey mode (in-person/phone/internet)
        resp_survey_mode = "in-person", # confirmed (website)

      # country (character vector; list of countries as written in original source)
        resp_country_original = to_character(Q1), # label value 14 (Yemen) and 15 (Syrian refugees) are in the attributes but not in the data set

      # country (character vector; standardised country name via countryname())
        resp_country_common = 
          countryname(resp_country_original),
        
      # interview date (variable of class Date; if only month given, input 1st of month)
      # no respondent-level date is set here, so the column is an all-NA Date
        resp_interview_date = as.Date(NA_character_)) %>% 
      # add resp_interview_start_date / resp_interview_end_date by country;
      # countries with no row in the dates table get NA for both columns
        left_join(
          aoi2019_2020_dates, by = "resp_country_common") %>% 
        mutate(
   
    #########################  
    ##### DEMOGRAPHICS ######  
    #########################
      
      # respondent's denomination (character vector built from the Q1216 value labels)
        resp_denomination = to_character(Q1216),   # huge missingness: all missing for Jordan
        resp_denomination = str_replace_all(resp_denomination, "[^[A-Za-z,]]", ""), # remove special characters (keeps only letters and commas, so spaces are dropped too)
    
      # respondent's religion (character vector that corresponds to master list)
        resp_religion = 
          dplyr::recode(
            as.character(Q1215),
            "1" = "Muslim",
            "2" = "Christian",
            "3" = "Other religion",
            "4" = "Other religion", # Druze
            "97" = NA_character_,
            .default = NA_character_),
    
      # fix errors in resp_religion (assuming extra info provided in resp_denomination is correct)
        resp_religion = 
          case_when(
            resp_denomination %in% c("Shii", "Sunni") ~ "Muslim",
            resp_denomination %in% c("GreekOrthodox") ~ "Christian",
            TRUE ~ resp_religion),
    
      # respondent's age (character vector; bins denoted by single dash ["18-25"]) 
        resp_age = 
          case_when( # interval; no NAs
            Q1201 == 1 ~ "18-24",
            Q1201 == 2 ~ "25-34",
            Q1201 == 3 ~ "35-44",
            Q1201 == 4 ~ "45-54",
            Q1201 == 5 ~ "55+",
            Q1201 == 97 ~ NA_character_,
            TRUE ~ NA_character_),
      
      # respondent's education level
      # NOTE(review): "3. Secondary" carries the same [Primary] tag as
      # "2. Less than secondary" -- confirm this is intended rather than [Secondary]
        resp_education_original =
          dplyr::recode(
            as.character(Q1206),
            "1" = "1. Illiterate/limited [No education]",
            "2" = "2. Less than secondary [Primary]", # not sure if this should be [Primary]
            "3" = "3. Secondary [Primary]",
            "4" = "4. Higher than secondary [College]", # this is an assumption, but best possible given coarseness
            "97" = NA_character_,
            .default = NA_character_),       
      
      # respondent's gender (numeric: female = 1; male = 0; other = NA)
      # coded explicitly so that any code other than 1/2 (e.g. a refusal code)
      # becomes NA instead of being silently counted as male
        resp_female = 
          case_when(
            Q1202 == 1 ~ 0,
            Q1202 == 2 ~ 1,
            TRUE ~ NA_real_),
      
      # respondent resident in rural (vs urban) area (numeric: rural = 1; urban/semi-urban/peri-urban = 0)
        resp_rural = 
          case_when( # no NAs
            Q4 == 1 ~ 0,
            Q4 == 2 ~ 1,
            TRUE ~ NA_real_),
      
    #########################  
    ### SOCIAL DISTANCE 1 ###  
    #########################
    
      # original question number; question text; response options (input above)
        resp_soc_dist_1_qinfo = "NUM: 2020.56 [Q2020.56.5]; QTEXT: Would you accept or oppose your daughter or your sister marrying a person from another sect or confession?; ROPTIONS: 1 = Accept [=0] + 2 = Oppose [=1]; TARGET: Different sect, general; TYPE: Distance, family",
      
      # original response (as character vector; codes 6 and 7 set to NA, other codes kept as-is)
        resp_soc_dist_1_original = 
          dplyr::recode(
            as.character(Q2020_56_5),
            "6" = NA_character_,
            "7" = NA_character_),       

      # binary recode (numeric: 1 = any negative attitude expressed; 0 otherwise)
        resp_soc_dist_1_bin_recode = 
          case_when(
            Q2020_56_5 %in% c(1) ~ 0,
            Q2020_56_5 %in% c(2) ~ 1,
            TRUE ~ NA_real_),
    
    #########################  
    ### SOCIAL DISTANCE 2 ###  
    #########################
    
      # original question number; question text; response options (input above)
        resp_soc_dist_2_qinfo = "NUM: 425 [Q425.1]; QTEXT: Which of the following groups would you not like to have as neighbors: Followers of different religions.; ROPTIONS: 1 = I do not want them as neighbors [=1] + 2 = I am impartial [=0]; TARGET: Different religion; TYPE: Distance, neighbor",
      
      # original response (as character vector; codes 6, 7 and 9 set to NA, other codes kept as-is)
        resp_soc_dist_2_original = 
          dplyr::recode(
            as.character(Q425_1),
            "6" = NA_character_,
            "7" = NA_character_,
            "9" = NA_character_),       

      # binary recode (numeric: 1 = any negative attitude expressed; 0 otherwise)
        resp_soc_dist_2_bin_recode = 
          case_when(
            Q425_1 %in% c(1) ~ 1,
            Q425_1 %in% c(2) ~ 0,
            TRUE ~ NA_real_),
    
    ############################  
    ### GENERAL SOCIAL TRUST ###  
    ############################
    
      # original question number; question text; response options (input above)
        resp_gentrust_qinfo = NA_character_, # checked
    
      # original response (as character vector)
        resp_gentrust_original = NA_character_, # checked       

      # binary recode (numeric: 1 = any negative attitude expressed; 0 otherwise)
        resp_gentrust_bin_recode = NA_real_,  # checked
    
    #########################  
    ##### RELIGIOSITY #######  
    #########################
    
      # original question number; question text; response options (input above)
        resp_religiosity_qinfo = "NUM: 602 [Q602]; QTEXT: Regardless of whether or not you attend places of worship, do you consider yourself to be...;  ROPTIONS: 1 = Very religious (religiously observant) + 2 = Religious (religiously observant) to some extent + 3 = Not religious (not religiously observant) + 4 = A non-believer",
  
      # original response (as numeric vector, with non-substantive responses coded as NA_real_)
        resp_religiosity_original = 
          case_when(
            Q602 %in% c(6, 7, 9) ~ NA_real_,
            TRUE ~ as.numeric(Q602)),       

      # recode (numeric: scaled 0-1, where 1 is more religious)
      # maps code 1 -> 1, 2 -> 2/3, 3 -> 1/3, 4 -> 0
        resp_religiosity_recode = (4 - resp_religiosity_original)/3
    
    ) %>% 
    # keep only the harmonised output columns
    select(starts_with("resp_"))
```

# Save data

```{r}
  # write the cleaned table to disk as a single-object RDS file
  saveRDS(
    object = clean_aoi2019_2020,
    file = "../cleaned-data/y-24-multi-arab-opinion-index.rds"
  )
```
